NOTE: This document was generated with
sctransformversion 0.3.2.9003
With this vignette we explore whether sctransform can be used with Smart-Seq2 data and what changes to the default parameters might be needed.
As an example data set we use the Smart-Seq2 data from the Tabula muris project.
We have already loaded the data as a list facs with components counts and meta_data. Here we have already subsetted the data to contain only cells from bone marrow. There are a total of 5355 cells.
Distribution of total counts per cell
# we are going to ignore the spike-in controls
is_ercc <- grepl(pattern = "^ERCC-", x = rownames(facs$counts))
facs$meta_data$counts <- sparseMatrixStats::colSums2(x = facs$counts, rows = !is_ercc)
p1 <- ggplot(facs$meta_data, aes(plate, log10(counts))) + geom_boxplot(aes(fill = mouse.id)) +
coord_flip()
show(p1)Distribution of genes detected per cell
facs$meta_data$genes <- sparseMatrixStats::colSums2(x = facs$counts > 0, rows = !is_ercc)
p1 <- ggplot(facs$meta_data, aes(plate, log10(genes))) + geom_boxplot(aes(fill = mouse.id)) +
coord_flip()
show(p1)Plot some more cell attributes: Distribution of spike in controls (ERCC), ribosomal genes, mitochondrial read fraction
facs$meta_data$pct_ercc <- sparseMatrixStats::colSums2(x = facs$counts, rows = is_ercc)/sparseMatrixStats::colSums2(x = facs$counts) *
100
is_ribo <- grepl("^Mrp[sl]", rownames(facs$counts))
facs$meta_data$pct_ribo <- sparseMatrixStats::colSums2(x = facs$counts, rows = is_ribo)/facs$meta_data$counts *
100
mitocarta <- readxl::read_excel(path = file.path(base_dir, "data", "Mouse.MitoCarta3.0.xls"),
sheet = 2)
mito_genes <- mitocarta$Symbol[1:400]
is_mito <- rownames(facs$counts) %in% mito_genes
facs$meta_data$pct_mito <- sparseMatrixStats::colSums2(x = facs$counts, rows = is_mito)/facs$meta_data$counts *
100
entropy <- function(p) {
-sum(p * log(p), na.rm = TRUE)
}
facs$meta_data$entropy <- apply(t(facs$counts[!is_ercc, ])/facs$meta_data$counts,
1, entropy)
p1 <- ggplot(facs$meta_data, aes(plate, pct_ercc)) + geom_boxplot(aes(fill = mouse.id)) +
coord_flip()
p2 <- ggplot(facs$meta_data, aes(plate, pct_ribo)) + geom_boxplot(aes(fill = mouse.id)) +
coord_flip()
p3 <- ggplot(facs$meta_data, aes(plate, pct_mito)) + geom_boxplot(aes(fill = mouse.id)) +
coord_flip()
p4 <- ggplot(facs$meta_data, aes(plate, exp(entropy))) + geom_boxplot(aes(fill = mouse.id)) +
coord_flip()
show(((p1 + p2)/(p3 + p4)) + plot_layout(guides = "collect"))Number of cells per mouse
| mouse.id | n |
|---|---|
| 3_10_M | 1266 |
| 3_38_F | 1081 |
| 3_39_F | 808 |
| 3_8_M | 1177 |
| 3_9_M | 1023 |
sctransform::vstvst_out <- vst(umi = facs$counts[!is_ercc, ], cell_attr = facs$meta_data, method = "qpoisson",
return_cell_attr = TRUE, verbosity = 1)
#> Calculating cell attributes from input UMI matrix: log_umi
#> Variance stabilizing transformation of count matrix of size 17479 by 5355
#> Model formula is y ~ log_umi
#> Get Negative Binomial regression parameters per gene
#> Using 2000 genes, 5355 cells
#> There are 6 estimated thetas smaller than 1e-07 - will be set to 1e-07
#> Found 12 outliers - those will be ignored in fitting/regularization step
#> Second step: Get residuals using fitted parameters for 17479 genes
#> Calculating gene attributes
#> Wall clock passed: Time difference of 18.61333 secsp1 <- ggplot(vst_out$cell_attr, aes(log_umi)) + geom_histogram(binwidth = 0.05) +
xlab("Total counts per cell [log10]") + ylab("Number of cells")
p2 <- ggplot(vst_out$gene_attr, aes(log10(gmean))) + geom_histogram(binwidth = 0.05) +
xlab("Geometric mean of gene [log10]") + ylab("Number of genes")
show(p1 | p2)It looks like there is a relationship between the mean of a gene and the model parameters and it is similar in style to what we see with UMI data. However, we see a much higher variance for all genes compared to UMI data where we usually see var = mean for genes with mean < 10.
ga <- tibble::rownames_to_column(vst_out$gene_attr, var = "gene") %>% mutate(highlight = rank(-residual_variance) <=
25)
p1 <- ggplot(ga, aes(log10(gmean), sqrt(residual_variance), label = gene)) + geom_point() +
geom_smooth() + geom_text_repel(data = filter(ga, highlight), size = 3, color = "red") +
geom_point(data = filter(ga, highlight), size = 0.66, color = "red") + ggtitle("Residual variance as function of gene mean")
show(p1)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'p1 <- ggplot(vst_out$gene_attr, aes(residual_mean)) + geom_histogram(binwidth = 0.01) +
xlim(-3, 3)
p2 <- ggplot(vst_out$gene_attr, aes(residual_variance)) + geom_histogram(binwidth = 0.1) +
geom_vline(xintercept = 1, color = "red") + xlim(0, 10)
show(p1 + p2)The residual variance is quite covers a wide range is not as peaked around 1 as we are used to from UMI data. Perhaps this is not surprising since we do not have UMIs to remove technical noise introduced during PCR amplification.
As a result we might want to change the way we clip very high residuals to make sure signal from biologically variable genes is not dampened.
Look at the models of the top variable genes
goi <- filter(ga, rank(-residual_variance) < 5) %>% pull(gene)
plot_model(x = vst_out, umi = facs$counts, goi = goi, plot_residual = TRUE)In comparison, show non-variable genes with similar mean
goi2 <- sapply(goi, function(g) {
g_gmean <- filter(ga, gene == g) %>% pull(gmean)
(filter(ga, residual_variance < 1.25, residual_variance > 0.75) %>% arrange(abs(gmean -
g_gmean)) %>% pull(gene))[1]
})
plot_model(x = vst_out, umi = facs$counts, goi = goi2, plot_residual = TRUE)s <- CreateSeuratObject(counts = facs$counts[!is_ercc, ], meta.data = facs$meta_data)
# run sctransform with less additional clipping of the residuals and all cells
s <- SCTransform(s, verbose = FALSE, method = "qpoisson", clip.range = sqrt(ncol(s))/5 *
c(-1, 1), ncells = Inf)
# 'standard' workflow
s <- RunPCA(s, verbose = FALSE)
s <- RunUMAP(s, dims = 1:30, verbose = FALSE)
s <- FindNeighbors(s, dims = 1:30, verbose = FALSE)
s <- FindClusters(s, verbose = FALSE)
p1 <- DimPlot(s, label = TRUE) + NoLegend() + ggtitle("Unsupervised clustering") +
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank())
p2 <- DimPlot(s, label = TRUE, group.by = "subtissue") + ggtitle("Meta data subtissue label") +
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank())
show(p1 + p2)Find the top markers of the unsupervised clusters
de_res <- diff_mean_test(y = GetAssayData(s, assay = "SCT", slot = "counts"), group_labels = s$seurat_clusters,
R = 49, mean_th = 10, only_pos = TRUE, only_top_n = 222, verbosity = 0)
top_markers <- group_by(de_res, group1) %>% filter(rank(-zscore, ties.method = "first") <=
4 | rank(-log2FC, ties.method = "first") <= 4) %>% select(group1, gene, mean1,
mean2, log2FC, zscore, emp_pval_adj)
p1 <- ggplot(de_res, aes(pmin(log2FC, 10), pmin(log10(zscore), 4))) + geom_point(aes(color = emp_pval_adj <
0.05)) + geom_point(data = top_markers, color = "deeppink") + geom_text_repel(data = top_markers,
mapping = aes(label = gene)) + facet_wrap(~group1, ncol = 5) + theme(legend.position = "bottom")
show(p1)Table of top markers per cluster
Seurat heatmap
marker_genes <- group_by(de_res, group1) %>% filter(rank(-log2FC, ties.method = "first") <=
4) %>% pull(gene)
# make sure all markers are in the scale.data slot (by default only the highly
# variable genes are there)
s <- GetResidual(s, features = marker_genes, verbose = FALSE)
DoHeatmap(s, features = marker_genes, slot = "scale.data")s$log_counts <- log10(s$nCount_RNA)
FeaturePlot(s, features = c("pct_ercc", "pct_mito", "pct_ribo", "log_counts"))Contribution of mice to clusters
p1 <- group_by(s@meta.data, seurat_clusters, mouse.id) %>% summarise(cells = n()) %>%
ggplot(aes(seurat_clusters, cells, fill = mouse.id)) + geom_bar(stat = "identity")
p2 <- group_by(s@meta.data, seurat_clusters) %>% mutate(n = n()) %>% group_by(seurat_clusters,
mouse.id) %>% summarise(frequency = n()/n[1]) %>% ggplot(aes(seurat_clusters,
frequency, fill = mouse.id)) + geom_bar(stat = "identity")
show((p1 + p2) + plot_layout(guides = "collect"))s <- CreateSeuratObject(counts = facs$counts[!is_ercc, ], meta.data = facs$meta_data)
s <- NormalizeData(s, normalization.method = "LogNormalize", verbose = FALSE)
s <- FindVariableFeatures(s, verbose = FALSE)
s <- ScaleData(s, verbose = FALSE)
s <- RunPCA(s, verbose = FALSE)
s <- RunUMAP(s, dims = 1:30, verbose = FALSE)
s <- FindNeighbors(s, dims = 1:30, verbose = FALSE)
s <- FindClusters(s, verbose = FALSE)
p1 <- DimPlot(s, label = TRUE) + NoLegend() + ggtitle("Unsupervised clustering") +
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank())
p2 <- DimPlot(s, label = TRUE, group.by = "subtissue") + ggtitle("Meta data subtissue label") +
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank())
show(p1 + p2)Find the top markers of the unsupervised clusters
s_markers <- FindAllMarkers(s, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 1,
verbose = FALSE)
top_markers <- group_by(s_markers, cluster) %>% filter(rank(-avg_log2FC, ties.method = "first") <=
4) %>% select(cluster, gene, avg_log2FC, pct.1, pct.2, p_val_adj)Table of top markers per cluster
Seurat heatmaps$log_counts <- log10(s$nCount_RNA)
FeaturePlot(s, features = c("pct_ercc", "pct_mito", "pct_ribo", "log_counts"))Contribution of mice to clusters
p1 <- group_by(s@meta.data, seurat_clusters, mouse.id) %>% summarise(cells = n()) %>%
ggplot(aes(seurat_clusters, cells, fill = mouse.id)) + geom_bar(stat = "identity")
p2 <- group_by(s@meta.data, seurat_clusters) %>% mutate(n = n()) %>% group_by(seurat_clusters,
mouse.id) %>% summarise(frequency = n()/n[1]) %>% ggplot(aes(seurat_clusters,
frequency, fill = mouse.id)) + geom_bar(stat = "identity")
show((p1 + p2) + plot_layout(guides = "collect"))The count distribution for Smart-Seq2 data looks quite different from UMI-based data. Even low counts get are amplified such that there is a clear distinction between drop-outs (gene not detected) and low expression. In contrast, for UMI data this is a continuous gradient. The regression with a Negative Binomial model works, but is likely not the appropriate model. It is not cleat what benefits the default sctrasnsform workflow provides compared to the ‘standard’ log-normalization workflow currently used by Seurat.
Session info
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Catalina 10.15.7
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] patchwork_1.1.0.9000 ggrepel_0.8.2 dplyr_1.0.2
#> [4] knitr_1.30 Seurat_3.9.9.9008 sctransform_0.3.2.9003
#> [7] reshape2_1.4.4 ggplot2_3.3.2 Matrix_1.2-18
#>
#> loaded via a namespace (and not attached):
#> [1] nlme_3.1-149 matrixStats_0.57.0 RcppAnnoy_0.0.16
#> [4] RColorBrewer_1.1-2 httr_1.4.2 tools_4.0.2
#> [7] DT_0.16 R6_2.5.0 irlba_2.3.3
#> [10] rpart_4.1-15 KernSmooth_2.23-17 uwot_0.1.8.9001
#> [13] mgcv_1.8-33 lazyeval_0.2.2 colorspace_2.0-0
#> [16] withr_2.3.0 tidyselect_1.1.0 gridExtra_2.3
#> [19] compiler_4.0.2 formatR_1.7 plotly_4.9.2.1
#> [22] labeling_0.4.2 scales_1.1.1 lmtest_0.9-38
#> [25] spatstat.data_1.4-3 ggridges_0.5.2 pbapply_1.4-3
#> [28] spatstat_1.64-1 goftest_1.2-2 stringr_1.4.0
#> [31] digest_0.6.27 spatstat.utils_1.17-0 rmarkdown_2.5
#> [34] pkgconfig_2.0.3 htmltools_0.5.0 MatrixGenerics_1.3.0
#> [37] sparseMatrixStats_1.3.1 limma_3.44.3 highr_0.8
#> [40] fastmap_1.0.1 readxl_1.3.1 htmlwidgets_1.5.2
#> [43] rlang_0.4.9 shiny_1.5.0 farver_2.0.3
#> [46] generics_0.0.2 zoo_1.8-8 jsonlite_1.7.2
#> [49] crosstalk_1.1.0.1 ica_1.0-2 magrittr_2.0.1
#> [52] Rcpp_1.0.5 munsell_0.5.0 abind_1.4-5
#> [55] reticulate_1.16 lifecycle_0.2.0 stringi_1.5.3
#> [58] yaml_2.2.1 MASS_7.3-53 Rtsne_0.15
#> [61] plyr_1.8.6 grid_4.0.2 parallel_4.0.2
#> [64] listenv_0.8.0 promises_1.1.1 crayon_1.3.4.9000
#> [67] deldir_0.1-29 miniUI_0.1.1.1 lattice_0.20-41
#> [70] cowplot_1.1.0 splines_4.0.2 tensor_1.5
#> [73] pillar_1.4.7 igraph_1.2.6 future.apply_1.6.0
#> [76] codetools_0.2-16 leiden_0.3.3 glue_1.4.2
#> [79] evaluate_0.14 data.table_1.13.2 vctrs_0.3.5
#> [82] png_0.1-7 httpuv_1.5.4 cellranger_1.1.0
#> [85] gtable_0.3.0 RANN_2.6.1 purrr_0.3.4
#> [88] polyclip_1.10-0 tidyr_1.1.2 future_1.19.1
#> [91] xfun_0.19 rsvd_1.0.3 mime_0.9
#> [94] xtable_1.8-4 RSpectra_0.16-0 later_1.1.0.1
#> [97] survival_3.2-3 viridisLite_0.3.0 tibble_3.0.4
#> [100] cluster_2.1.0 globals_0.13.1 fitdistrplus_1.1-1
#> [103] ellipsis_0.3.1 ROCR_1.0-11Runtime